# -*- coding: utf-8 -*-
"""
Created on Fri Apr 20 09:38:07 2018

@author: lizheng
"""
import pandas as pd
import sys
#sys.path.append("D:/PDM/SPBM")
sys.path.append("/root/lizheng")
#sys.path.append("C:/Users/my/Desktop")
import spbm_model11 as sp
import jieba
import jieba.analyse
#jieba.enable_parallel()  # parallel word segmentation; only usable on Linux
jieba.load_userdict('/root/lizheng/ciku.txt')
from pyspark.sql import SparkSession
from sklearn.externals import joblib
import numpy as np
# Version tag appended to the Spark application name
# (presumably the Spark major version this job targets -- confirm).
SPARK_VERSION = 2
# Name handed to SparkSession.builder.appName(); evaluates to 'lz_pyspark,2'.
APP_NAME = 'lz_pyspark,{}'.format(SPARK_VERSION)

def load_data(file, processes=30):
    """Read a CSV file, derive an ``hwmc`` column and return a Spark DataFrame.

    Every CSV column is read as text. Each value of the ``HWMC`` column is
    passed through ``sp.cut_jieba`` in a pool of worker processes, and the
    results are stored in a new lower-case ``hwmc`` column before the pandas
    frame is converted to a Spark DataFrame.

    Args:
        file: Path (or any object ``pandas.read_csv`` accepts) of the CSV to
            load. It must contain an ``HWMC`` column.
        processes: Number of worker processes used for the ``sp.cut_jieba``
            calls. Defaults to 30, the value that used to be hard-coded.

    Returns:
        A Spark DataFrame with the CSV columns plus ``hwmc``.

    Raises:
        KeyError: If the CSV has no ``HWMC`` column.
    """
    from multiprocessing import Pool

    spark = SparkSession.builder.appName(APP_NAME).enableHiveSupport().getOrCreate()
    df = pd.read_csv(file, dtype='str')

    # Missing cells are still parsed as NaN with dtype='str'; astype(str)
    # turns them into the text 'nan' so every worker input is a string.
    texts = df['HWMC'].astype(str).tolist()

    pool = Pool(processes)
    try:
        df['hwmc'] = pool.map(sp.cut_jieba, texts)
    finally:
        # Release the worker processes even when map() raises; otherwise
        # they would linger until the interpreter exits.
        pool.close()
        pool.join()

    return spark.createDataFrame(df)

if __name__ == "__main__":
    # Script entry point: load the CSV at the hard-coded path into a Spark
    # DataFrame, load the pickled model, and map the model over every row.

    # Obtain the Hive-enabled session; load_data() requests a session with the
    # same builder settings, and getOrCreate() reuses an existing one.
    spark = SparkSession.builder.appName(APP_NAME).enableHiveSupport().getOrCreate()
    df = load_data('/root/lizheng/comparison/TradingEnterprise/income/2016.csv')
    # NOTE(review): joblib.load unpickles the file, so this path must only
    # ever point at a trusted artifact.
    clf = joblib.load('/root/lizheng/model/Cleandata180307.pkl.gz')
    # For each row, feed sp.get_hv(row.hwmc) to clf.predict. RDD.map is a lazy
    # transformation: nothing is computed until an action is run on spbmpro,
    # and no such action is visible in this part of the file.
    # NOTE(review): the lambda closes over clf, so the model is presumably
    # serialized with the task closure -- confirm this is acceptable for its size.
    spbmpro = df.rdd.map(lambda p:clf.predict(sp.get_hv(p.hwmc)))  # compute the code for each row